########################################################
## PROGRAM NAME: 003_counties.R                       ##
## AUTHOR: MATT MLECZKO                               ##
## INPUTS:                                            ##
##         001_msa_del.Rda                            ##
##         Five-year ACS estimates                    ##
##         (2005-2009, 2018-2022)                     ##
##         via tidycensus package                     ##
##                                                    ##
## OUTPUTS:                                           ##
##        003_ct_cs_t_cnty_xwalk.Rda                  ##
##        003_msa_2009.Rda                            ##
##        003_msa_2022.Rda                            ##
##        003_msa_0922.Rda                            ##
##                                                    ## 
## PURPOSE: Download/ process county-level            ##
##          ACS data, create MSA-level file           ##
##                                                    ##
## LIST OF UPDATES:                                   ##
##                                                    ##
########################################################

#log <- file(# USER DEFINED PATH AND FILE NAME HERE #)
#sink(log, append=TRUE)
#sink(log, append=TRUE, type="message")

## load libraries ##

library(tidycensus)
library(tigris)
library(tidyr)
library(dplyr)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(gsubfn)
library(haven)
library(readxl)
library(data.table)

## define paths ##
input_path <- # USER DEFINED PATH HERE #
output_path <- # USER DEFINED PATH HERE #

## set working directory ##
setwd(input_path)

`%notin%` <- Negate(`%in%`)

## create a merge function that creates merge frequency as in Stata ##
## from user rwbuie at stackoverflow: https://stackoverflow.com/questions/30358401/is-there-a-way-to-create-statas-merge-indicator-variable-with-rs-merge ##
stata.merge <- function(x,y, name){
  x$df1 <- 1
  y$df2 <- 2
  df <- merge(x,y, by = name, all = TRUE)
  df$merge.variable <- rowSums(df[,c("df1", "df2")], na.rm=TRUE)
  df$df1 <- NULL
  df$df2<- NULL
  df
  #print(table(df$merge.variable))
  
  ## return the merged dataframe
  return(df)
}

## input Census key ##

#census_api_key(# USER CENSUS API KEY HERE #)

## states to loop through ##
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE",
            "DC","FL","GA","HI","ID","IL","IN","IA","KS",
            "KY","LA","ME","MD","MA","MI","MN","MS","MO",
            "MT","NE","NV","NH","NJ","NM","NY","NC","ND",
            "OH","OK","OR","PA","RI","SC","SD","TN","TX",
            "UT","VT","VA","WA","WV","WI","WY")

load(paste0(output_path,
            "001_msa_del.Rda"))

######################
## process ACS data ##
######################

v2009 <- load_variables(2009, "acs5")

## 2005-2009 counties ##

all.cnty.acs.2009 <- get_acs(geography = "county",
                             variables = c(pop_total = "B01003_001",
                                           race_den = "B02001_001",
                                           pop_white = "B02001_002",
                                           pop_black = "B02001_003",
                                           pop_aian = "B02001_004",
                                           pop_asian = "B02001_005",
                                           pop_nhpi = "B02001_006",
                                           pop_other = "B02001_007",
                                           pop_multi = "B02001_008",
                                           pop_multi_oth = "B02001_009",
                                           pop_multi_noth = "B02001_010",
                                           hisp_race_den = "B03002_001",
                                           pop_nh = "B03002_002",
                                           pop_nh_white = "B03002_003",
                                           pop_nh_black = "B03002_004",
                                           pop_nh_aian = "B03002_005",
                                           pop_nh_asian = "B03002_006",
                                           pop_nh_nhpi = "B03002_007",
                                           pop_nh_other = "B03002_008",
                                           pop_nh_multi = "B03002_009",
                                           pop_nh_multi_oth = "B03002_010",
                                           pop_nh_multi_noth ="B03002_011",
                                           pop_h = "B03002_012",
                                           pop_h_white = "B03002_013",
                                           pop_h_black = "B03002_014",
                                           pop_h_aian = "B03002_015",
                                           pop_h_asian = "B03002_016",
                                           pop_h_nhpi = "B03002_017",
                                           pop_h_other = "B03002_018",
                                           pop_h_multi = "B03002_019",
                                           pop_h_multi_oth = "B03002_020",
                                           pop_h_multi_noth = "B03002_021",
                                           age_den = "B01001_001",
                                           tot_male = "B01001_002",
                                           male_lt5 = "B01001_003",
                                           male_5t9 = "B01001_004",
                                           male_10t14 = "B01001_005",
                                           male_15t17 = "B01001_006",
                                           male_18t19 = "B01001_007",
                                           male_20 = "B01001_008",
                                           male_21 = "B01001_009",
                                           male_22t24 = "B01001_010",
                                           male_25t29 = "B01001_011",
                                           male_30t34 = "B01001_012",
                                           male_35t39 = "B01001_013",
                                           male_40t44 = "B01001_014",
                                           male_45t49 = "B01001_015",
                                           male_50t54 = "B01001_016",
                                           male_55t59 = "B01001_017",
                                           male_60t61 = "B01001_018",
                                           male_62t64 = "B01001_019",
                                           male_65t66 = "B01001_020",
                                           male_67t69 = "B01001_021",
                                           male_70t74 = "B01001_022",
                                           male_75t79 = "B01001_023",
                                           male_80t84 = "B01001_024",
                                           male_85over = "B01001_025",
                                           tot_female = "B01001_026",
                                           female_lt5 = "B01001_027",
                                           female_5t9 = "B01001_028",
                                           female_10t14 = "B01001_029",
                                           female_15t17 = "B01001_030",
                                           female_18t19 = "B01001_031",
                                           female_20 = "B01001_032",
                                           female_21 = "B01001_033",
                                           female_22t24 = "B01001_034",
                                           female_25t29 = "B01001_035",
                                           female_30t34 = "B01001_036",
                                           female_35t39 = "B01001_037",
                                           female_40t44 = "B01001_038",
                                           female_45t49 = "B01001_039",
                                           female_50t54 = "B01001_040",
                                           female_55t59 = "B01001_041",
                                           female_60t61 = "B01001_042",
                                           female_62t64 = "B01001_043",
                                           female_65t66 = "B01001_044",
                                           female_67t69 = "B01001_045",
                                           female_70t74 = "B01001_046",
                                           female_75t79 = "B01001_047",
                                           female_80t84 = "B01001_048",
                                           female_85over = "B01001_049",
                                           median_age = "B01002_001",
                                           hhlds_child_den = "B11005_001",
                                           hhlds_child_num = "B11005_002",
                                           hhld_move_den = "B07003_001",
                                           hhld_nomove = "B07003_004",
                                           foreign_den = "B05002_001",
                                           foreign_born = "B05002_013",
                                           hhs = "B25003_001",
                                           oo_hu = "B25003_002",
                                           ro_hu = "B25003_003",
                                           median_gross_rent = "B25064_001",
                                           median_prop_value = "B25077_001",
                                           median_yr_str = "B25035_001",
                                           total_hu = "B25002_001",
                                           total_vacant = "B25002_003",
                                           median_hhld_inc = "B19013_001",
                                           hhld_pov_den = "B17017_001",
                                           hhld_pov_num = "B17017_002",
                                           ed_den = "B15002_001",
                                           ed_tot_m = "B15002_002",
                                           ed_none_m = "B15002_003",
                                           ed_n_4_m = "B15002_004",
                                           ed_56_m = "B15002_005",
                                           ed_78_m = "B15002_006",
                                           ed_9_m = "B15002_007",
                                           ed_10_m = "B15002_008",
                                           ed_11_m = "B15002_009",
                                           ed_12_nd_m = "B15002_010",
                                           ed_12_d_m = "B15002_011",
                                           ed_scl1_m = "B15002_012",
                                           ed_sc1m_m = "B15002_013",
                                           ed_as_m = "B15002_014",
                                           ed_ba_m = "B15002_015",
                                           ed_ma_m = "B15002_016",
                                           ed_pd_m = "B15002_017",
                                           ed_phd_m = "B15002_018",
                                           ed_tot_f = "B15002_019",
                                           ed_none_f = "B15002_020",
                                           ed_n_4_f = "B15002_021",
                                           ed_56_f = "B15002_022",
                                           ed_78_f = "B15002_023",
                                           ed_9_f = "B15002_024",
                                           ed_10_f = "B15002_025",
                                           ed_11_f = "B15002_026",
                                           ed_12_nd_f = "B15002_027",
                                           ed_12_d_f = "B15002_028",
                                           ed_scl1_f = "B15002_029",
                                           ed_sc1m_f = "B15002_030",
                                           ed_as_f = "B15002_031",
                                           ed_ba_f = "B15002_032",
                                           ed_ma_f = "B15002_033",
                                           ed_pd_f = "B15002_034",
                                           ed_phd_f = "B15002_035",
                                           #gini = "B19083_001",
                                           mlf_1619 = "B23001_006",
                                           mup_1619 = "B23001_008",
                                           mlf_2021 = "B23001_013",
                                           mup_2021 = "B23001_015",
                                           mlf_2224 = "B23001_020",
                                           mup_2224 = "B23001_022",
                                           mlf_2529 = "B23001_027",
                                           mup_2529 = "B23001_029",
                                           mlf_3034 = "B23001_034",
                                           mup_3034 = "B23001_036",
                                           mlf_3544 = "B23001_041",
                                           mup_3544 = "B23001_043",
                                           mlf_4554 = "B23001_048",
                                           mup_4554 = "B23001_050",
                                           mlf_5559 = "B23001_055",
                                           mup_5559 = "B23001_057",
                                           mlf_6061 = "B23001_062",
                                           mup_6061 = "B23001_064",
                                           mlf_6264 = "B23001_069",
                                           mup_6264 = "B23001_071",
                                           mlf_6569 = "B23001_074",
                                           mup_6569 = "B23001_076",
                                           mlf_7074 = "B23001_079",
                                           mup_7074 = "B23001_081",
                                           mlf_75u = "B23001_084",
                                           mup_75u = "B23001_086",
                                           flp_1619 = "B23001_092",
                                           fup_1619 = "B23001_094",
                                           flp_2021 = "B23001_099",
                                           fup_2021 = "B23001_101",
                                           flp_2224 = "B23001_106",
                                           fup_2224 = "B23001_108",
                                           flp_2529 = "B23001_113",
                                           fup_2529 = "B23001_115",
                                           flp_3034 = "B23001_120",
                                           fup_3034 = "B23001_122",
                                           flp_3544 = "B23001_127",
                                           fup_3544 = "B23001_129",
                                           flp_4554 = "B23001_134",
                                           fup_4554 = "B23001_136",
                                           flp_5559 = "B23001_141",
                                           fup_5559 = "B23001_143",
                                           flp_6061 = "B23001_148",
                                           fup_6061 = "B23001_150",
                                           flp_6264 = "B23001_155",
                                           fup_6264 = "B23001_157",
                                           flp_6569 = "B23001_160",
                                           fup_6569 = "B23001_162",
                                           flp_7074 = "B23001_165",
                                           fup_7074 = "B23001_167",
                                           flp_75u = "B23001_170",
                                           fup_75u = "B23001_172",
                                           all_rentals = "B25063_001",
                                           rent_cr = "B25063_002",
                                           rent_cr_100 = "B25063_003",
                                           rent_cr_149 = "B25063_004",
                                           rent_cr_199 = "B25063_005",
                                           rent_cr_249 = "B25063_006",
                                           rent_cr_299 = "B25063_007",
                                           rent_cr_349 = "B25063_008",
                                           rent_cr_399 = "B25063_009",
                                           rent_cr_449 = "B25063_010",
                                           rent_cr_499 = "B25063_011",
                                           rent_cr_549 = "B25063_012",
                                           rent_cr_599 = "B25063_013",
                                           rent_cr_649 = "B25063_014",
                                           rent_cr_699 = "B25063_015",
                                           rent_cr_749 = "B25063_016",
                                           rent_cr_799 = "B25063_017",
                                           rent_cr_899 = "B25063_018",
                                           rent_cr_999 = "B25063_019",
                                           rent_cr_1249 = "B25063_020",
                                           rent_cr_1499 = "B25063_021",
                                           rent_cr_1999 = "B25063_022",
                                           rent_cr_2000 = "B25063_023"),
                             survey = "acs5",
                             output = "wide",
                             year = 2009)

## add in Gini index and land area from Social Explorer ## 

cnty.oth.2009.in <- read.csv("R13374396_SL050.csv",
                             header=T,
                             stringsAsFactors = F)

cnty.oth.2009 <- cnty.oth.2009.in %>%
  select(Geo_FIPS,
         SE_A00003_002,
         SE_A14028_001) %>%
  mutate(GEOID = str_pad(Geo_FIPS, 5, "left", "0")) %>%
  rename(land_area_2009 = SE_A00003_002,
         gini_2009 = SE_A14028_001) %>%
  select(GEOID,
         land_area_2009,
         gini_2009)

## combine and clean ##

all.cnty.acs.2009.pr <- all.cnty.acs.2009 %>%
  select(-ends_with("M")) %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID = GOID) %>%
  rename_at(vars(-GEOID,-NAM),function(x) paste0(x,"_2009")) %>%
  inner_join(cnty.oth.2009, "GEOID") %>%
  ## adjust $ vars for inflation (All Items CPI-U-RS Annual Averages) ##
  mutate(median_gross_rent_2009 = median_gross_rent_2009 * 437.6/316.7,
         median_prop_value_2009 = median_prop_value_2009 * 437.6/316.7,
         median_hhld_inc_2009 = median_hhld_inc_2009 * 437.6/316.7)

print("SAME OBS AFTER JOIN?")
nrow(all.cnty.acs.2009) == nrow(all.cnty.acs.2009.pr)

## prepare for aggregation ##

metro.cnty.2009 <- all.cnty.acs.2009.pr %>%
  rename(FIPS = GEOID) %>%
  inner_join(msa.del.2020.rd, "FIPS") %>%
  rename(CBSA = `CBSA Code`) %>%
  group_by(CBSA) %>%
  mutate(pop_wt_2009 = pop_total_2009/sum(pop_total_2009)) %>%
  ungroup()

## aggregate county to msa ## 

cnty.msa.2009.c1 <- metro.cnty.2009 %>%
  select(-median_gross_rent_2009,
         -median_prop_value_2009,
         -median_hhld_inc_2009,
         -median_age_2009,
         -gini_2009,
         -median_yr_str_2009) %>%
  group_by(CBSA) %>%
  summarize_at(vars(pop_total_2009:land_area_2009), sum, na.rm=T) %>%
  ungroup() 

## now, weighted vars ##
cnty.msa.2009.c2 <- metro.cnty.2009 %>%
  select(CBSA,
         pop_wt_2009,
         median_gross_rent_2009,
         median_prop_value_2009,
         median_hhld_inc_2009,
         median_age_2009,
         median_yr_str_2009,
         gini_2009) %>%
  group_by(CBSA) %>%
  summarize_at(vars(median_gross_rent_2009:gini_2009),
               funs(weighted.mean(., pop_wt_2009, na.rm=T))) %>%
  ungroup()


summary(cnty.msa.2009.c2)

## now, combine ##

msa.2009.m <- stata.merge(cnty.msa.2009.c1,
                          cnty.msa.2009.c2,
                          "CBSA")

## check merge ##
table(msa.2009.m$merge.variable, useNA = "ifany")

## keep matches ##

msa.2009 <- msa.2009.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## save file ##

save(msa.2009,
     file = paste0(output_path,
                   "003_msa_2009.Rda"))

## 2018-2022 counties ##

v2022 <- load_variables(2022, "acs5")

all.cnty.acs.2022 <- get_acs(geography = "county",
                             variables = c(pop_total = "B01003_001",
                                           race_den = "B02001_001",
                                           pop_white = "B02001_002",
                                           pop_black = "B02001_003",
                                           pop_aian = "B02001_004",
                                           pop_asian = "B02001_005",
                                           pop_nhpi = "B02001_006",
                                           pop_other = "B02001_007",
                                           pop_multi = "B02001_008",
                                           pop_multi_oth = "B02001_009",
                                           pop_multi_noth = "B02001_010",
                                           hisp_race_den = "B03002_001",
                                           pop_nh = "B03002_002",
                                           pop_nh_white = "B03002_003",
                                           pop_nh_black = "B03002_004",
                                           pop_nh_aian = "B03002_005",
                                           pop_nh_asian = "B03002_006",
                                           pop_nh_nhpi = "B03002_007",
                                           pop_nh_other = "B03002_008",
                                           pop_nh_multi = "B03002_009",
                                           pop_nh_multi_oth = "B03002_010",
                                           pop_nh_multi_noth ="B03002_011",
                                           pop_h = "B03002_012",
                                           pop_h_white = "B03002_013",
                                           pop_h_black = "B03002_014",
                                           pop_h_aian = "B03002_015",
                                           pop_h_asian = "B03002_016",
                                           pop_h_nhpi = "B03002_017",
                                           pop_h_other = "B03002_018",
                                           pop_h_multi = "B03002_019",
                                           pop_h_multi_oth = "B03002_020",
                                           pop_h_multi_noth = "B03002_021",
                                           age_den = "B01001_001",
                                           tot_male = "B01001_002",
                                           male_5u = "B01001_003",
                                           male_5t9 = "B01001_004",
                                           male_10t14 = "B01001_005",
                                           male_15t17 = "B01001_006",
                                           male_18t19 = "B01001_007",
                                           male_20 = "B01001_008",
                                           male_21 = "B01001_009",
                                           male_22t24 = "B01001_010",
                                           male_25t29 = "B01001_011",
                                           male_30t34 = "B01001_012",
                                           male_35t39 = "B01001_013",
                                           male_40t44 = "B01001_014",
                                           male_45t49 = "B01001_015",
                                           male_50t54 = "B01001_016",
                                           male_55t59 = "B01001_017",
                                           male_60t61 = "B01001_018",
                                           male_62t64 = "B01001_019",
                                           male_65t66 = "B01001_020",
                                           male_67t69 = "B01001_021",
                                           male_70t74 = "B01001_022",
                                           male_75t79 = "B01001_023",
                                           male_80t84 = "B01001_024",
                                           male_85over = "B01001_025",
                                           tot_female = "B01001_026",
                                           female_5u = "B01001_027",
                                           female_5t9 = "B01001_028",
                                           female_10t14 = "B01001_029",
                                           female_15t17 = "B01001_030",
                                           female_18t19 = "B01001_031",
                                           female_20 = "B01001_032",
                                           female_21 = "B01001_033",
                                           female_22t24 = "B01001_034",
                                           female_25t29 = "B01001_035",
                                           female_30t34 = "B01001_036",
                                           female_35t39 = "B01001_037",
                                           female_40t44 = "B01001_038",
                                           female_45t49 = "B01001_039",
                                           female_50t54 = "B01001_040",
                                           female_55t59 = "B01001_041",
                                           female_60t61 = "B01001_042",
                                           female_62t64 = "B01001_043",
                                           female_65t66 = "B01001_044",
                                           female_67t69 = "B01001_045",
                                           female_70t74 = "B01001_046",
                                           female_75t79 = "B01001_047",
                                           female_80t84 = "B01001_048",
                                           female_85over = "B01001_049",
                                           median_age = "B01002_001",
                                           hhlds_child_den = "B11005_001",
                                           hhlds_child_num = "B11005_002",
                                           hhld_move_den = "B07001_001",
                                           hhld_nomove = "B07001_017",
                                           foreign_den = "B05002_001",
                                           foreign_born = "B05002_013",
                                           hhs = "B25003_001",
                                           oo_hu = "B25003_002",
                                           ro_hu = "B25003_003",
                                           median_gross_rent = "B25064_001",
                                           median_prop_value = "B25077_001",
                                           median_yr_str = "B25035_001",
                                           total_hu = "B25002_001",
                                           total_vacant = "B25002_003",
                                           median_hhld_inc = "B19013_001",
                                           hhld_pov_den = "B17017_001",
                                           hhld_pov_num = "B17017_002",
                                           ed_den = "B15003_001",
                                           ed_none = "B15003_002",
                                           ed_nurse = "B15003_003",
                                           ed_k = "B15003_004",
                                           ed_1 = "B15003_005",
                                           ed_2 = "B15003_006",
                                           ed_3 = "B15003_007",
                                           ed_4 = "B15003_008",
                                           ed_5 = "B15003_009",
                                           ed_6 = "B15003_010",
                                           ed_7 = "B15003_011",
                                           ed_8 = "B15003_012",
                                           ed_9 = "B15003_013",
                                           ed_10 = "B15003_014",
                                           ed_11 = "B15003_015",
                                           ed_12_nd = "B15003_016",
                                           ed_12_d = "B15003_017",
                                           ed_ged = "B15003_018",
                                           ed_sc1l = "B15003_019",
                                           ed_sc1m = "B15003_020",
                                           ed_as = "B15003_021",
                                           ed_ba = "B15003_022",
                                           ed_ma = "B15003_023",
                                           ed_pd = "B15003_024",
                                           ed_phd = "B15003_025",
                                           gini = "B19083_001",
                                           mlf_1619 = "B23001_006",
                                           mup_1619 = "B23001_008",
                                           mlf_2021 = "B23001_013",
                                           mup_2021 = "B23001_015",
                                           mlf_2224 = "B23001_020",
                                           mup_2224 = "B23001_022",
                                           mlf_2529 = "B23001_027",
                                           mup_2529 = "B23001_029",
                                           mlf_3034 = "B23001_034",
                                           mup_3034 = "B23001_036",
                                           mlf_3544 = "B23001_041",
                                           mup_3544 = "B23001_043",
                                           mlf_4554 = "B23001_048",
                                           mup_4554 = "B23001_050",
                                           mlf_5559 = "B23001_055",
                                           mup_5559 = "B23001_057",
                                           mlf_6061 = "B23001_062",
                                           mup_6061 = "B23001_064",
                                           mlf_6264 = "B23001_069",
                                           mup_6264 = "B23001_071",
                                           mlf_6569 = "B23001_074",
                                           mup_6569 = "B23001_076",
                                           mlf_7074 = "B23001_079",
                                           mup_7074 = "B23001_081",
                                           mlf_75u = "B23001_084",
                                           mup_75u = "B23001_086",
                                           flp_1619 = "B23001_092",
                                           fup_1619 = "B23001_094",
                                           flp_2021 = "B23001_099",
                                           fup_2021 = "B23001_101",
                                           flp_2224 = "B23001_106",
                                           fup_2224 = "B23001_108",
                                           flp_2529 = "B23001_113",
                                           fup_2529 = "B23001_115",
                                           flp_3034 = "B23001_120",
                                           fup_3034 = "B23001_122",
                                           flp_3544 = "B23001_127",
                                           fup_3544 = "B23001_129",
                                           flp_4554 = "B23001_134",
                                           fup_4554 = "B23001_136",
                                           flp_5559 = "B23001_141",
                                           fup_5559 = "B23001_143",
                                           flp_6061 = "B23001_148",
                                           fup_6061 = "B23001_150",
                                           flp_6264 = "B23001_155",
                                           fup_6264 = "B23001_157",
                                           flp_6569 = "B23001_160",
                                           fup_6569 = "B23001_162",
                                           flp_7074 = "B23001_165",
                                           fup_7074 = "B23001_167",
                                           flp_75u = "B23001_170",
                                           fup_75u = "B23001_172",
                                           all_rentals = "B25063_001",
                                           rent_cr = "B25063_002",
                                           rent_cr_100 = "B25063_003",
                                           rent_cr_149 = "B25063_004",
                                           rent_cr_199 = "B25063_005",
                                           rent_cr_249 = "B25063_006",
                                           rent_cr_299 = "B25063_007",
                                           rent_cr_349 = "B25063_008",
                                           rent_cr_399 = "B25063_009",
                                           rent_cr_449 = "B25063_010",
                                           rent_cr_499 = "B25063_011",
                                           rent_cr_549 = "B25063_012",
                                           rent_cr_599 = "B25063_013",
                                           rent_cr_649 = "B25063_014",
                                           rent_cr_699 = "B25063_015",
                                           rent_cr_749 = "B25063_016",
                                           rent_cr_799 = "B25063_017",
                                           rent_cr_899 = "B25063_018",
                                           rent_cr_999 = "B25063_019",
                                           rent_cr_1249 = "B25063_020",
                                           rent_cr_1499 = "B25063_021",
                                           rent_cr_1999 = "B25063_022",
                                           rent_cr_2499 = "B25063_023",
                                           rent_cr_2999 = "B25063_024",
                                           rent_cr_3499 = "B25063_025",
                                           rent_cr_3500 = "B25063_026"),
                             survey = "acs5",
                             output = "wide",
                             year = 2022)

## get land area info ## 

cnty2022.area <- counties(year = 2022,
                          cb=TRUE)

## attach land area values - places ##

cnty2022.area.fin <- as.data.frame(cnty2022.area) %>%
  select(GEOID, 
         ALAND,
         AWATER) %>%
  mutate(land_area_2022_raw = as.numeric(ALAND),
         land_area_2022 = land_area_2022_raw/2589988) %>%
  select(GEOID,
         land_area_2022)

class(cnty2022.area.fin)

## combine and clean ## 

all.cnty.acs.2022.pr <- all.cnty.acs.2022 %>%
  select(-ends_with("M")) %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID = GOID,
         NAME = NAM) %>%
  rename_at(vars(-GEOID,-NAME),function(x) paste0(x,"_2022")) %>%
  inner_join(cnty2022.area.fin, "GEOID")
 
print("SAME OBS AFTER JOIN?")
nrow(all.cnty.acs.2022) == nrow(all.cnty.acs.2022.pr)

## drop CT counties ## 

all.cnty.acs.2022.pr.noct <- all.cnty.acs.2022.pr %>%
  filter(!grepl("Connecticut", NAME))

sum(grepl("Connecticut", all.cnty.acs.2022.pr$NAME))
nrow(all.cnty.acs.2022.pr.noct) == nrow(all.cnty.acs.2022.pr) - sum(grepl("Connecticut", all.cnty.acs.2022.pr$NAME))

## read in CT county to cousub xwalk ## 

ct.cs.in <- read_xlsx("ct_cou_to_cousub_crosswalk.xlsx")

names(ct.cs.in) <- c("sffips_old",
                     "cfips_old",
                     "county_name_old",
                     "cfips_new",
                     "county_name_new",
                     "csfp_new",
                     "cousub_geoid_old",
                     "cousub_geoid_new",
                     "cousub_name_new",
                     "cousubns",
                     "cousub_lsad",
                     "cousub_funcstat",
                     "cousub_classfp")

ct.cs <- ct.cs.in %>%
  filter(county_name_new != "" & 
         cousub_name_new != "County subdivisions not defined") %>%
  rename(GEOID = cousub_geoid_new)

## save for later programs ##
save(ct.cs,
     file = paste0(output_path,
                   "003_ct_cs_t_cnty_xwalk.Rda"))

## read-in county subs ## 

all.cousub.acs.2022 <- get_acs(geography = "county subdivision",
                               variables = c(pop_total = "B01003_001",
                                           race_den = "B02001_001",
                                           pop_white = "B02001_002",
                                           pop_black = "B02001_003",
                                           pop_aian = "B02001_004",
                                           pop_asian = "B02001_005",
                                           pop_nhpi = "B02001_006",
                                           pop_other = "B02001_007",
                                           pop_multi = "B02001_008",
                                           pop_multi_oth = "B02001_009",
                                           pop_multi_noth = "B02001_010",
                                           hisp_race_den = "B03002_001",
                                           pop_nh = "B03002_002",
                                           pop_nh_white = "B03002_003",
                                           pop_nh_black = "B03002_004",
                                           pop_nh_aian = "B03002_005",
                                           pop_nh_asian = "B03002_006",
                                           pop_nh_nhpi = "B03002_007",
                                           pop_nh_other = "B03002_008",
                                           pop_nh_multi = "B03002_009",
                                           pop_nh_multi_oth = "B03002_010",
                                           pop_nh_multi_noth ="B03002_011",
                                           pop_h = "B03002_012",
                                           pop_h_white = "B03002_013",
                                           pop_h_black = "B03002_014",
                                           pop_h_aian = "B03002_015",
                                           pop_h_asian = "B03002_016",
                                           pop_h_nhpi = "B03002_017",
                                           pop_h_other = "B03002_018",
                                           pop_h_multi = "B03002_019",
                                           pop_h_multi_oth = "B03002_020",
                                           pop_h_multi_noth = "B03002_021",
                                           age_den = "B01001_001",
                                           tot_male = "B01001_002",
                                           male_5u = "B01001_003",
                                           male_5t9 = "B01001_004",
                                           male_10t14 = "B01001_005",
                                           male_15t17 = "B01001_006",
                                           male_18t19 = "B01001_007",
                                           male_20 = "B01001_008",
                                           male_21 = "B01001_009",
                                           male_22t24 = "B01001_010",
                                           male_25t29 = "B01001_011",
                                           male_30t34 = "B01001_012",
                                           male_35t39 = "B01001_013",
                                           male_40t44 = "B01001_014",
                                           male_45t49 = "B01001_015",
                                           male_50t54 = "B01001_016",
                                           male_55t59 = "B01001_017",
                                           male_60t61 = "B01001_018",
                                           male_62t64 = "B01001_019",
                                           male_65t66 = "B01001_020",
                                           male_67t69 = "B01001_021",
                                           male_70t74 = "B01001_022",
                                           male_75t79 = "B01001_023",
                                           male_80t84 = "B01001_024",
                                           male_85over = "B01001_025",
                                           tot_female = "B01001_026",
                                           female_5u = "B01001_027",
                                           female_5t9 = "B01001_028",
                                           female_10t14 = "B01001_029",
                                           female_15t17 = "B01001_030",
                                           female_18t19 = "B01001_031",
                                           female_20 = "B01001_032",
                                           female_21 = "B01001_033",
                                           female_22t24 = "B01001_034",
                                           female_25t29 = "B01001_035",
                                           female_30t34 = "B01001_036",
                                           female_35t39 = "B01001_037",
                                           female_40t44 = "B01001_038",
                                           female_45t49 = "B01001_039",
                                           female_50t54 = "B01001_040",
                                           female_55t59 = "B01001_041",
                                           female_60t61 = "B01001_042",
                                           female_62t64 = "B01001_043",
                                           female_65t66 = "B01001_044",
                                           female_67t69 = "B01001_045",
                                           female_70t74 = "B01001_046",
                                           female_75t79 = "B01001_047",
                                           female_80t84 = "B01001_048",
                                           female_85over = "B01001_049",
                                           median_age = "B01002_001",
                                           hhlds_child_den = "B11005_001",
                                           hhlds_child_num = "B11005_002",
                                           hhld_move_den = "B07001_001",
                                           hhld_nomove = "B07001_017",
                                           foreign_den = "B05002_001",
                                           foreign_born = "B05002_013",
                                           hhs = "B25003_001",
                                           oo_hu = "B25003_002",
                                           ro_hu = "B25003_003",
                                           median_gross_rent = "B25064_001",
                                           median_prop_value = "B25077_001",
                                           median_yr_str = "B25035_001",
                                           total_hu = "B25002_001",
                                           total_vacant = "B25002_003",
                                           median_hhld_inc = "B19013_001",
                                           hhld_pov_den = "B17017_001",
                                           hhld_pov_num = "B17017_002",
                                           ed_den = "B15003_001",
                                           ed_none = "B15003_002",
                                           ed_nurse = "B15003_003",
                                           ed_k = "B15003_004",
                                           ed_1 = "B15003_005",
                                           ed_2 = "B15003_006",
                                           ed_3 = "B15003_007",
                                           ed_4 = "B15003_008",
                                           ed_5 = "B15003_009",
                                           ed_6 = "B15003_010",
                                           ed_7 = "B15003_011",
                                           ed_8 = "B15003_012",
                                           ed_9 = "B15003_013",
                                           ed_10 = "B15003_014",
                                           ed_11 = "B15003_015",
                                           ed_12_nd = "B15003_016",
                                           ed_12_d = "B15003_017",
                                           ed_ged = "B15003_018",
                                           ed_sc1l = "B15003_019",
                                           ed_sc1m = "B15003_020",
                                           ed_as = "B15003_021",
                                           ed_ba = "B15003_022",
                                           ed_ma = "B15003_023",
                                           ed_pd = "B15003_024",
                                           ed_phd = "B15003_025",
                                           gini = "B19083_001",
                                           mlf_1619 = "B23001_006",
                                           mup_1619 = "B23001_008",
                                           mlf_2021 = "B23001_013",
                                           mup_2021 = "B23001_015",
                                           mlf_2224 = "B23001_020",
                                           mup_2224 = "B23001_022",
                                           mlf_2529 = "B23001_027",
                                           mup_2529 = "B23001_029",
                                           mlf_3034 = "B23001_034",
                                           mup_3034 = "B23001_036",
                                           mlf_3544 = "B23001_041",
                                           mup_3544 = "B23001_043",
                                           mlf_4554 = "B23001_048",
                                           mup_4554 = "B23001_050",
                                           mlf_5559 = "B23001_055",
                                           mup_5559 = "B23001_057",
                                           mlf_6061 = "B23001_062",
                                           mup_6061 = "B23001_064",
                                           mlf_6264 = "B23001_069",
                                           mup_6264 = "B23001_071",
                                           mlf_6569 = "B23001_074",
                                           mup_6569 = "B23001_076",
                                           mlf_7074 = "B23001_079",
                                           mup_7074 = "B23001_081",
                                           mlf_75u = "B23001_084",
                                           mup_75u = "B23001_086",
                                           flp_1619 = "B23001_092",
                                           fup_1619 = "B23001_094",
                                           flp_2021 = "B23001_099",
                                           fup_2021 = "B23001_101",
                                           flp_2224 = "B23001_106",
                                           fup_2224 = "B23001_108",
                                           flp_2529 = "B23001_113",
                                           fup_2529 = "B23001_115",
                                           flp_3034 = "B23001_120",
                                           fup_3034 = "B23001_122",
                                           flp_3544 = "B23001_127",
                                           fup_3544 = "B23001_129",
                                           flp_4554 = "B23001_134",
                                           fup_4554 = "B23001_136",
                                           flp_5559 = "B23001_141",
                                           fup_5559 = "B23001_143",
                                           flp_6061 = "B23001_148",
                                           fup_6061 = "B23001_150",
                                           flp_6264 = "B23001_155",
                                           fup_6264 = "B23001_157",
                                           flp_6569 = "B23001_160",
                                           fup_6569 = "B23001_162",
                                           flp_7074 = "B23001_165",
                                           fup_7074 = "B23001_167",
                                           flp_75u = "B23001_170",
                                           fup_75u = "B23001_172",
                                           all_rentals = "B25063_001",
                                           rent_cr = "B25063_002",
                                           rent_cr_100 = "B25063_003",
                                           rent_cr_149 = "B25063_004",
                                           rent_cr_199 = "B25063_005",
                                           rent_cr_249 = "B25063_006",
                                           rent_cr_299 = "B25063_007",
                                           rent_cr_349 = "B25063_008",
                                           rent_cr_399 = "B25063_009",
                                           rent_cr_449 = "B25063_010",
                                           rent_cr_499 = "B25063_011",
                                           rent_cr_549 = "B25063_012",
                                           rent_cr_599 = "B25063_013",
                                           rent_cr_649 = "B25063_014",
                                           rent_cr_699 = "B25063_015",
                                           rent_cr_749 = "B25063_016",
                                           rent_cr_799 = "B25063_017",
                                           rent_cr_899 = "B25063_018",
                                           rent_cr_999 = "B25063_019",
                                           rent_cr_1249 = "B25063_020",
                                           rent_cr_1499 = "B25063_021",
                                           rent_cr_1999 = "B25063_022",
                                           rent_cr_2499 = "B25063_023",
                                           rent_cr_2999 = "B25063_024",
                                           rent_cr_3499 = "B25063_025",
                                           rent_cr_3500 = "B25063_026"),
                             state = "CT",
                             survey = "acs5",
                             output = "wide",
                             year = 2022)

## get land area info ## 

ct.cousub2022.area <- county_subdivisions(year = 2022,
                                          state = "CT",
                                          cb=TRUE)

## attach land area values - places ##

ct.cousub.2022.area.fin <- as.data.frame(ct.cousub2022.area) %>%
  select(GEOID, 
         ALAND,
         AWATER) %>%
  mutate(land_area_2022_raw = as.numeric(ALAND),
         land_area_2022 = land_area_2022_raw/2589988) %>%
  select(GEOID,
         land_area_2022)

class(ct.cousub.2022.area.fin)

## combine and clean ## 

all.cousub.acs.2022.pr <- all.cousub.acs.2022 %>%
  select(-ends_with("M")) %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID = GOID,
         NAME = NAM) %>%
  rename_at(vars(-GEOID,-NAME),function(x) paste0(x,"_2022")) %>%
  inner_join(ct.cousub.2022.area.fin, "GEOID")

print("SAME OBS AFTER JOIN - SHOULD BE FALSE")
nrow(all.cousub.acs.2022) == nrow(all.cousub.acs.2022.pr)

## the missing obs are invalid ##
ct.miss <- all.cousub.acs.2022 %>%
  anti_join(all.cousub.acs.2022.pr, "GEOID")

## merge the ct xwalk to the ct cousub file ##

length(unique(all.cousub.acs.2022.pr$GEOID)) == nrow(all.cousub.acs.2022.pr)
range(nchar(all.cousub.acs.2022.pr$GEOID))

length(unique(ct.cs$GEOID)) == nrow(ct.cs)
range(nchar(ct.cs$GEOID))

ct.cs.t.county.m <- stata.merge(all.cousub.acs.2022.pr,
                                ct.cs, 
                                "GEOID")

## check merge ##
table(ct.cs.t.county.m$merge.variable, useNA = "ifany")

## aggregate the cousubs to county ##
ct.cs.t.county1 <- ct.cs.t.county.m %>%
  filter(merge.variable == 3) %>%
  select(-GEOID,
         -median_age_2022,
         -median_gross_rent_2022,
         -median_prop_value_2022,
         -median_yr_str_2022,
         -median_hhld_inc_2022,
         -gini_2022,
         -county_name_old,
         -cfips_new,
         -county_name_new,
         -csfp_new,
         -cousub_name_new,
         -cousubns,
         -cousub_lsad,
         -cousub_funcstat,
         -cousub_geoid_old,
         -cousub_classfp) %>%
  mutate(FIPS = paste0(sffips_old,
                       cfips_old)) %>%
  group_by(FIPS) %>%
  summarize_at(vars(pop_total_2022:land_area_2022),
               funs(sum(., na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID = FIPS) 

ct.cs.t.county2 <- ct.cs.t.county.m %>%
  mutate(FIPS = paste0(sffips_old,
                       cfips_old)) %>%
  group_by(FIPS) %>%
  mutate(wt = pop_total_2022/sum(pop_total_2022,na.rm=T)) %>%
  select(FIPS,
         wt,
         median_age_2022,
         median_gross_rent_2022,
         median_prop_value_2022,
         median_yr_str_2022,
         median_hhld_inc_2022,
         gini_2022) %>%
  summarize_at(vars(median_age_2022:gini_2022),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID = FIPS)

ct.cs.t.county3 <- ct.cs.t.county.m %>%
  mutate(FIPS = paste0(sffips_old,
                       cfips_old)) %>%
  select(FIPS,
         county_name_old) %>%
  group_by(FIPS) %>%
  summarize(NAME = first(county_name_old), .groups = "drop") %>%
  rename(GEOID = FIPS)

## stack the files ## 
## 9 planning regions -> 8 counties ## 

print("OBS PRIOR TO JOIN")
nrow(ct.cs.t.county1)
nrow(ct.cs.t.county2)
nrow(ct.cs.t.county3)

ct.counties.2022.rf <- ct.cs.t.county1 %>%
  inner_join(ct.cs.t.county2, "GEOID") %>%
  inner_join(ct.cs.t.county3, "GEOID") %>%
  select(GEOID,
         NAME,
         everything())

print("OBS AFTER JOIN")
nrow(ct.counties.2022.rf) 

## bring back the ct counties ## 

all.cnty.acs.2022.cb <- rbind(all.cnty.acs.2022.pr.noct,
                              ct.counties.2022.rf)

## check ##
nrow(all.cnty.acs.2022.cb) == nrow(all.cnty.acs.2022.pr) - 1

## prepare for aggregation ##

metro.cnty.2022 <- all.cnty.acs.2022.cb %>%
  rename(FIPS = GEOID) %>%
  inner_join(msa.del.2020.rd, "FIPS") %>%
  rename(CBSA = `CBSA Code`) %>%
  group_by(CBSA) %>%
  mutate(pop_wt_2022 = pop_total_2022/sum(pop_total_2022)) %>%
  ungroup()

## check CT counties ##
metro.cnty.2022$pop_total_2022[metro.cnty.2022$FIPS == "09001"] # fairfield
metro.cnty.2022$pop_total_2022[metro.cnty.2022$FIPS == "09003"] # hartford 
metro.cnty.2022$pop_total_2022[metro.cnty.2022$FIPS == "09007"] # middlesex
metro.cnty.2022$pop_total_2022[metro.cnty.2022$FIPS == "09009"] # new haven
metro.cnty.2022$pop_total_2022[metro.cnty.2022$FIPS == "09011"] # new london
metro.cnty.2022$pop_total_2022[metro.cnty.2022$FIPS == "09013"] # tolland
metro.cnty.2022$pop_total_2022[metro.cnty.2022$FIPS == "09015"] # windham

## aggregate county to msa ## 

## first, simple variables ##
cnty.msa.2022.c1 <- metro.cnty.2022 %>%
  select(-median_gross_rent_2022,
         -median_prop_value_2022,
         -median_hhld_inc_2022,
         -median_age_2022,
         -median_yr_str_2022,
         -gini_2022) %>%
  group_by(CBSA) %>%
  summarize_at(vars(pop_total_2022:land_area_2022), sum, na.rm=T) %>%
  ungroup()

## now, weighted vars ##
cnty.msa.2022.c2 <- metro.cnty.2022 %>%
  select(CBSA,
         pop_wt_2022,
         median_gross_rent_2022,
         median_prop_value_2022,
         median_hhld_inc_2022,
         median_yr_str_2022,
         median_age_2022,
         gini_2022) %>%
  group_by(CBSA) %>%
  summarize_at(vars(median_gross_rent_2022:gini_2022),
               funs(weighted.mean(., pop_wt_2022, na.rm=T))) %>%
  ungroup() 

summary(cnty.msa.2022.c2)

## now, combine ##

msa.2022.m <- stata.merge(cnty.msa.2022.c1,
                          cnty.msa.2022.c2,
                          "CBSA")

## check merge ##
table(msa.2022.m$merge.variable, useNA = "ifany")

## keep matches ##

msa.2022 <- msa.2022.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

## save file ##

save(msa.2022,
     file = paste0(output_path,
                   "003_msa_2022.Rda"))

## combine the two msa-level files ##

nrow(msa.2009)
nrow(msa.2022)

ncol(msa.2009)
ncol(msa.2022)

## merge ##

msa.0922.m <- stata.merge(msa.2009,
                          msa.2022,
                          "CBSA")

## check merge ##

table(msa.0922.m$merge.variable, useNA = "ifany")

## keep matches, final processing ##

msa.0922 <- msa.0922.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(pop_density_2009 = pop_total_2009/land_area_2009,
         per_18t34_2009 = rowSums(cbind(male_18t19_2009 , female_18t19_2009 ,male_20_2009 , female_20_2009 ,
                                        male_21_2009 , female_21_2009 , male_22t24_2009 , female_22t24_2009 , 
                                        male_25t29_2009 , female_25t29_2009 , male_30t34_2009 , female_30t34_2009),na.rm=T)/age_den_2009,
         per_35t64_2009 = rowSums(cbind(male_35t39_2009 , female_35t39_2009 , male_40t44_2009 , female_40t44_2009 , 
                                        male_45t49_2009 , female_45t49_2009 , male_50t54_2009 , female_50t54_2009 , 
                                        male_55t59_2009 , female_55t59_2009 , male_60t61_2009 , female_60t61_2009 , 
                                        male_62t64_2009 , female_62t64_2009),na.rm=T)/age_den_2009,
         per_65over_2009 = rowSums(cbind(male_65t66_2009 , female_65t66_2009 , male_67t69_2009 , female_67t69_2009 , 
                                         male_70t74_2009 , female_70t74_2009 , male_75t79_2009 , female_75t79_2009 , 
                                         male_80t84_2009 , female_80t84_2009 , male_85over_2009 , female_85over_2009),na.rm=T)/age_den_2009,
         per_aian_2009 = pop_nh_aian_2009/pop_total_2009,
         per_asian_2009 = pop_nh_asian_2009/pop_total_2009,
         per_black_2009 = pop_nh_black_2009/pop_total_2009,
         per_latino_2009 = pop_h_2009/pop_total_2009,
         per_other_2009 = rowSums(cbind(pop_nh_other_2009,
                                        pop_nh_nhpi_2009,
                                        pop_nh_multi_2009))/pop_total_2009,
         per_white_2009 = pop_nh_white_2009/pop_total_2009,
         log_asian_2009 = case_when(pop_nh_asian_2009 != 0 ~ log(1/per_asian_2009),
                                    pop_nh_asian_2009 == 0 ~ 0),
         log_black_2009 = case_when(pop_nh_black_2009 != 0 ~ log(1/per_black_2009),
                                    pop_nh_black_2009 == 0 ~ 0),
         log_latino_2009 = case_when(pop_h_2009!= 0 ~ log(1/per_latino_2009),
                                     pop_h_2009 == 0 ~ 0),
         log_white_2009 = case_when(pop_nh_white_2009 != 0 ~ log(1/per_white_2009),
                                    pop_nh_white_2009 == 0 ~ 0),
         log_aian_2009 = case_when(pop_nh_aian_2009 != 0 ~ log(1/per_aian_2009),
                                   pop_nh_aian_2009 == 0 ~ 0),
         log_other_2009 = case_when(rowSums(cbind(pop_nh_other_2009,
                                                  pop_nh_nhpi_2009,
                                                  pop_nh_multi_2009),na.rm=T) != 0 ~ log(1/per_other_2009),
                                    rowSums(cbind(pop_nh_other_2009,
                                                  pop_nh_nhpi_2009,
                                                  pop_nh_multi_2009),na.rm=T) == 0 ~ 0),
         entropy_er_2009 = per_asian_2009*log_asian_2009 + 
                           per_black_2009*log_black_2009 + 
                           per_latino_2009*log_latino_2009 + 
                           per_white_2009*log_white_2009 +
                           per_aian_2009*log_aian_2009 +
                           per_other_2009*log_other_2009,
         entropy_ers_2009 = (entropy_er_2009 - min(entropy_er_2009))/(max(entropy_er_2009)-min(entropy_er_2009)),
         per_child_2009 = hhlds_child_num_2009/hhlds_child_den_2009,
         per_fb_2009 = foreign_born_2009/foreign_den_2009,
         per_nomove_2009 = hhld_nomove_2009/hhld_move_den_2009,
         vr_2009 = total_vacant_2009/total_hu_2009,
         hownrt_2009 = ifelse(hhs_2009 > 0, oo_hu_2009/hhs_2009, NA),
         per_bm_2009 = rowSums(cbind(ed_ba_m_2009,ed_ba_f_2009,
                                     ed_ma_m_2009,ed_ma_f_2009,
                                     ed_pd_m_2009,ed_pd_f_2009,
                                     ed_phd_m_2009,ed_phd_m_2009))/ed_den_2009,
         hhld_pov_rt_2009 = hhld_pov_num_2009/hhld_pov_den_2009,
         mlfp_2009 = rowSums(cbind(mlf_1619_2009,mlf_2021_2009,mlf_2224_2009,mlf_2529_2009,
                                   mlf_3034_2009,mlf_3544_2009,mlf_4554_2009,mlf_5559_2009,
                                   mlf_6061_2009,mlf_6264_2009,mlf_6569_2009,mlf_7074_2009,
                                   mlf_75u_2009)),
         flfp_2009 = rowSums(cbind(flp_1619_2009,flp_2021_2009,flp_2224_2009,flp_2529_2009,
                                   flp_3034_2009,flp_3544_2009,flp_4554_2009,flp_5559_2009,
                                   flp_6061_2009,flp_6264_2009,flp_6569_2009,flp_7074_2009,
                                   flp_75u_2009)),
         mu_2009 = rowSums(cbind(mup_1619_2009,mup_2021_2009,mup_2224_2009,mup_2529_2009,
                                 mup_3034_2009,mup_3544_2009,mup_4554_2009,mup_5559_2009,
                                 mup_6061_2009,mup_6264_2009,mup_6569_2009,mup_7074_2009,
                                 mup_75u_2009)),
         fu_2009 = rowSums(cbind(fup_1619_2009,fup_2021_2009,fup_2224_2009,fup_2529_2009,
                                 fup_3034_2009,fup_3544_2009,fup_4554_2009,fup_5559_2009,
                                 fup_6061_2009,fup_6264_2009,fup_6569_2009,fup_7074_2009,
                                 fup_75u_2009)),
         unemr_2009 = rowSums(cbind(mu_2009,fu_2009))/rowSums(cbind(mlfp_2009,flfp_2009)),
         pop_density_2022 = pop_total_2022/land_area_2022,
         per_18t34_2022 = rowSums(cbind(male_18t19_2022 , female_18t19_2022 ,male_20_2022 , female_20_2022 ,
                                        male_21_2022 , female_21_2022 , male_22t24_2022 , female_22t24_2022 , 
                                        male_25t29_2022 , female_25t29_2022 , male_30t34_2022 , female_30t34_2022),na.rm=T)/age_den_2022,
         per_35t64_2022 = rowSums(cbind(male_35t39_2022 , female_35t39_2022 , male_40t44_2022 , female_40t44_2022 , 
                                        male_45t49_2022 , female_45t49_2022 , male_50t54_2022 , female_50t54_2022 , 
                                        male_55t59_2022 , female_55t59_2022 , male_60t61_2022 , female_60t61_2022 , 
                                        male_62t64_2022 , female_62t64_2022),na.rm=T)/age_den_2022,
         per_65over_2022 = rowSums(cbind(male_65t66_2022 , female_65t66_2022 , male_67t69_2022 , female_67t69_2022 , 
                                         male_70t74_2022 , female_70t74_2022 , male_75t79_2022 , female_75t79_2022 , 
                                         male_80t84_2022 , female_80t84_2022 , male_85over_2022 , female_85over_2022),na.rm=T)/age_den_2022,
         per_aian_2022 = pop_nh_aian_2022/pop_total_2022,
         per_asian_2022 = pop_nh_asian_2022/pop_total_2022,
         per_black_2022 = pop_nh_black_2022/pop_total_2022,
         per_latino_2022 = pop_h_2022/pop_total_2022,
         per_other_2022 = rowSums(cbind(pop_nh_other_2022,
                                        pop_nh_nhpi_2022,
                                        pop_nh_multi_2022))/pop_total_2022,
         per_white_2022 = pop_nh_white_2022/pop_total_2022,
         log_asian_2022 = case_when(pop_nh_asian_2022 != 0 ~ log(1/per_asian_2022),
                                    pop_nh_asian_2022 == 0 ~ 0),
         log_black_2022 = case_when(pop_nh_black_2022 != 0 ~ log(1/per_black_2022),
                                    pop_nh_black_2022 == 0 ~ 0),
         log_latino_2022 = case_when(pop_h_2022!= 0 ~ log(1/per_latino_2022),
                                     pop_h_2022 == 0 ~ 0),
         log_white_2022 = case_when(pop_nh_white_2022 != 0 ~ log(1/per_white_2022),
                                    pop_nh_white_2022 == 0 ~ 0),
         log_aian_2022 = case_when(pop_nh_aian_2022 != 0 ~ log(1/per_aian_2022),
                                   pop_nh_aian_2022 == 0 ~ 0),
         log_other_2022 = case_when(rowSums(cbind(pop_nh_other_2022,
                                                  pop_nh_nhpi_2022,
                                                  pop_nh_multi_2022),na.rm=T) != 0 ~ log(1/per_other_2022),
                                    rowSums(cbind(pop_nh_other_2022,
                                                  pop_nh_nhpi_2022,
                                                  pop_nh_multi_2022),na.rm=T) == 0 ~ 0),
         entropy_er_2022 = per_asian_2022*log_asian_2022 + 
                           per_black_2022*log_black_2022 + 
                           per_latino_2022*log_latino_2022 + 
                           per_white_2022*log_white_2022 +
                           per_aian_2022*log_aian_2022 +
                           per_other_2022*log_other_2022,
         entropy_ers_2022 = (entropy_er_2022 - min(entropy_er_2022))/(max(entropy_er_2022)-min(entropy_er_2022)),
         per_child_2022 = hhlds_child_num_2022/hhlds_child_den_2022,
         per_fb_2022 = foreign_born_2022/foreign_den_2022,
         per_nomove_2022 = hhld_nomove_2022/hhld_move_den_2022,
         vr_2022 = total_vacant_2022/total_hu_2022,
         hownrt_2022 = ifelse(hhs_2022 > 0, oo_hu_2022/hhs_2022, NA),
         per_bm_2022 = rowSums(cbind(ed_ba_2022,ed_ma_2022,
                                     ed_pd_2022,ed_phd_2022))/ed_den_2022,
         hhld_pov_rt_2022 = hhld_pov_num_2022/hhld_pov_den_2022,
         mlfp_2022 = rowSums(cbind(mlf_1619_2022,mlf_2021_2022,mlf_2224_2022,mlf_2529_2022,
                                   mlf_3034_2022,mlf_3544_2022,mlf_4554_2022,mlf_5559_2022,
                                   mlf_6061_2022,mlf_6264_2022,mlf_6569_2022,mlf_7074_2022,
                                   mlf_75u_2022)),
         flfp_2022 = rowSums(cbind(flp_1619_2022,flp_2021_2022,flp_2224_2022,flp_2529_2022,
                                   flp_3034_2022,flp_3544_2022,flp_4554_2022,flp_5559_2022,
                                   flp_6061_2022,flp_6264_2022,flp_6569_2022,flp_7074_2022,
                                   flp_75u_2022)),
         mu_2022 = rowSums(cbind(mup_1619_2022,mup_2021_2022,mup_2224_2022,mup_2529_2022,
                                 mup_3034_2022,mup_3544_2022,mup_4554_2022,mup_5559_2022,
                                 mup_6061_2022,mup_6264_2022,mup_6569_2022,mup_7074_2022,
                                 mup_75u_2022)),
         fu_2022 = rowSums(cbind(fup_1619_2022,fup_2021_2022,fup_2224_2022,fup_2529_2022,
                                 fup_3034_2022,fup_3544_2022,fup_4554_2022,fup_5559_2022,
                                 fup_6061_2022,fup_6264_2022,fup_6569_2022,fup_7074_2022,
                                 fup_75u_2022)),
         unemr_2022 = rowSums(cbind(mu_2022,fu_2022))/rowSums(cbind(mlfp_2022,flfp_2022)),
         ## median hhld inc in 2009 = 51,425 * (437.6/316.7) = 71,056.46
         ## 30% of 71,056.46 = 21,316.94/12 = 1776.412 * 0.3 = 532.9236
         ## 50% of 71,056.46 = 35,528.23 = 2960.686 * 0.3 = 888.2058
         ## 80% of 71,056.46 = 56,845.17 = 4737.097 * 0.3 = 1421.129
         ## 
         ## 2009 eli lb = 349*(437.6/316.7) = 482.2305
         ## 2009 eli up = 399*(437.6/316.7) = 551.318
         eli_ushare_lb_2009 = rowSums(cbind(rent_cr_100_2009, rent_cr_149_2009, rent_cr_199_2009, rent_cr_249_2009,
                                            rent_cr_299_2009, rent_cr_349_2009))/rent_cr_2009,
         eli_ushare_ub_2009 = rowSums(cbind(rent_cr_100_2009, rent_cr_149_2009, rent_cr_199_2009, rent_cr_249_2009,
                                            rent_cr_299_2009, rent_cr_349_2009, rent_cr_399_2009))/rent_cr_2009,
         power_eli_2009 = (log(1-eli_ushare_lb_2009)-log(1-eli_ushare_ub_2009))/(log(551.318)-log(482.2305)),
         k_eli_2009 = ((eli_ushare_ub_2009-eli_ushare_lb_2009)/((1/482.2305^power_eli_2009)-(1/551.318^power_eli_2009)))^(1/power_eli_2009),
         eli_ushare_2009 = (1-(k_eli_2009/532.9236)^power_eli_2009),
         ## 2009 vli lb = 599*(437.6/316.7) = 827.6678
         ## 2009 vli up = 649*(437.6/316.7) = 896.7553
         vli_ushare_lb_2009 = rowSums(cbind(rent_cr_100_2009, rent_cr_149_2009, rent_cr_199_2009, rent_cr_249_2009,
                                            rent_cr_299_2009, rent_cr_349_2009, rent_cr_399_2009, rent_cr_449_2009,
                                            rent_cr_499_2009, rent_cr_549_2009, rent_cr_599_2009))/rent_cr_2009,
         vli_ushare_ub_2009 = rowSums(cbind(rent_cr_100_2009, rent_cr_149_2009, rent_cr_199_2009, rent_cr_249_2009,
                                            rent_cr_299_2009, rent_cr_349_2009, rent_cr_399_2009, rent_cr_449_2009,
                                            rent_cr_499_2009, rent_cr_549_2009, rent_cr_599_2009, rent_cr_649_2009))/rent_cr_2009,
         power_vli_2009 = (log(1-vli_ushare_lb_2009)-log(1-vli_ushare_ub_2009))/(log(896.7553)-log(827.6678)),
         k_vli_2009 = ((vli_ushare_ub_2009-vli_ushare_lb_2009)/((1/827.6678^power_vli_2009)-(1/896.7553^power_vli_2009)))^(1/power_vli_2009),
         vli_ushare_2009 = (1-(k_vli_2009/888.2058)^power_vli_2009),
         ## 2009 li lb = 999*(437.6/316.7) = 1380.368
         ## 2009 li up = 1249*(437.6/316.7) = 1725.805
         li_ushare_lb_2009 = rowSums(cbind(rent_cr_100_2009, rent_cr_149_2009, rent_cr_199_2009, rent_cr_249_2009,
                                           rent_cr_299_2009, rent_cr_349_2009, rent_cr_399_2009, rent_cr_449_2009,
                                           rent_cr_499_2009, rent_cr_549_2009, rent_cr_599_2009, rent_cr_649_2009,
                                           rent_cr_699_2009, rent_cr_749_2009, rent_cr_799_2009, rent_cr_899_2009,
                                           rent_cr_999_2009))/rent_cr_2009,
         li_ushare_ub_2009 = rowSums(cbind(rent_cr_100_2009, rent_cr_149_2009, rent_cr_199_2009, rent_cr_249_2009,
                                            rent_cr_299_2009, rent_cr_349_2009, rent_cr_399_2009, rent_cr_449_2009,
                                            rent_cr_499_2009, rent_cr_549_2009, rent_cr_599_2009, rent_cr_649_2009,
                                            rent_cr_699_2009, rent_cr_749_2009, rent_cr_799_2009, rent_cr_899_2009,
                                            rent_cr_999_2009, rent_cr_1249_2009))/rent_cr_2009,
         power_li_2009 = (log(1-li_ushare_lb_2009)-log(1-li_ushare_ub_2009))/(log(1725.805)-log(1380.368)),
         k_li_2009 = ((li_ushare_ub_2009-li_ushare_lb_2009)/((1/1380.368^power_li_2009)-(1/1725.805^power_li_2009)))^(1/power_li_2009),
         li_ushare_2009 = (1-(k_li_2009/1421.129)^power_li_2009),
         ## median hhld inc in 2022 = 75,149 
         ## 30% of 75,149 = 22,544.7/12 = 1878.725 * 0.3 = 563.6175
         ## 50% of 75,149 = 37,574.5/12 = 3131.208 * 0.3 = 939.3624
         ## 80% of 75,149 = 60,119.2/12 = 5009.933 * 0.3 = 1502.98
         ## 
         ## 2022 eli lb = 549
         ## 2022 eli up = 599
         eli_ushare_lb_2022 = rowSums(cbind(rent_cr_100_2022, rent_cr_149_2022, rent_cr_199_2022, rent_cr_249_2022,
                                            rent_cr_299_2022, rent_cr_349_2022, rent_cr_399_2022, rent_cr_449_2022,
                                            rent_cr_499_2022, rent_cr_549_2022))/rent_cr_2022,
         eli_ushare_ub_2022 = rowSums(cbind(rent_cr_100_2022, rent_cr_149_2022, rent_cr_199_2022, rent_cr_249_2022,
                                            rent_cr_299_2022, rent_cr_349_2022, rent_cr_399_2022, rent_cr_449_2022,
                                            rent_cr_499_2022, rent_cr_549_2022, rent_cr_599_2022))/rent_cr_2022,
         power_eli_2022 = (log(1-eli_ushare_lb_2022)-log(1-eli_ushare_ub_2022))/(log(599)-log(549)),
         k_eli_2022 = ((eli_ushare_ub_2022-eli_ushare_lb_2022)/((1/549^power_eli_2022)-(1/599^power_eli_2022)))^(1/power_eli_2022),
         eli_ushare_2022 = (1-(k_eli_2022/563.6175)^power_eli_2022),
         ## 2022 vli lb = 899
         ## 2022 vli up = 999
         vli_ushare_lb_2022 = rowSums(cbind(rent_cr_100_2022, rent_cr_149_2022, rent_cr_199_2022, rent_cr_249_2022,
                                            rent_cr_299_2022, rent_cr_349_2022, rent_cr_399_2022, rent_cr_449_2022,
                                            rent_cr_499_2022, rent_cr_549_2022, rent_cr_599_2022, rent_cr_649_2022,
                                            rent_cr_699_2022, rent_cr_749_2022, rent_cr_799_2022, rent_cr_899_2022))/rent_cr_2022,
         vli_ushare_ub_2022 = rowSums(cbind(rent_cr_100_2022, rent_cr_149_2022, rent_cr_199_2022, rent_cr_249_2022,
                                            rent_cr_299_2022, rent_cr_349_2022, rent_cr_399_2022, rent_cr_449_2022,
                                            rent_cr_499_2022, rent_cr_549_2022, rent_cr_599_2022, rent_cr_649_2022,
                                            rent_cr_699_2022, rent_cr_749_2022, rent_cr_799_2022, rent_cr_899_2022,
                                            rent_cr_999_2022))/rent_cr_2022,
         power_vli_2022 = (log(1-vli_ushare_lb_2022)-log(1-vli_ushare_ub_2022))/(log(999)-log(899)),
         k_vli_2022 = ((vli_ushare_ub_2022-vli_ushare_lb_2022)/((1/899^power_vli_2022)-(1/999^power_vli_2022)))^(1/power_vli_2022),
         vli_ushare_2022 = (1-(k_vli_2022/939.3624)^power_vli_2022),
         ## 2022 vli lb = 1499
         ## 2022 vli up = 1999
         li_ushare_lb_2022 = rowSums(cbind(rent_cr_100_2022, rent_cr_149_2022, rent_cr_199_2022, rent_cr_249_2022,
                                           rent_cr_299_2022, rent_cr_349_2022, rent_cr_399_2022, rent_cr_449_2022,
                                           rent_cr_499_2022, rent_cr_549_2022, rent_cr_599_2022, rent_cr_649_2022,
                                           rent_cr_699_2022, rent_cr_749_2022, rent_cr_799_2022, rent_cr_899_2022,
                                           rent_cr_999_2022, rent_cr_1249_2022))/rent_cr_2022,
         li_ushare_ub_2022 = rowSums(cbind(rent_cr_100_2022, rent_cr_149_2022, rent_cr_199_2022, rent_cr_249_2022,
                                           rent_cr_299_2022, rent_cr_349_2022, rent_cr_399_2022, rent_cr_449_2022,
                                           rent_cr_499_2022, rent_cr_549_2022, rent_cr_599_2022, rent_cr_649_2022,
                                           rent_cr_699_2022, rent_cr_749_2022, rent_cr_799_2022, rent_cr_899_2022,
                                           rent_cr_999_2022, rent_cr_1249_2022, rent_cr_1499_2022))/rent_cr_2022,
         power_li_2022 = (log(1-li_ushare_lb_2022)-log(1-li_ushare_ub_2022))/(log(1999)-log(1499)),
         k_li_2022 = ((li_ushare_ub_2022-li_ushare_lb_2022)/((1/1499^power_li_2022)-(1/1999^power_li_2022)))^(1/power_li_2022),
         li_ushare_2022 = (1-(k_li_2022/1502.98)^power_li_2022)) %>%
  select(CBSA,
         pop_total_2009,
         pop_density_2009,
         pop_nh_aian_2009,
         pop_nh_asian_2009,
         pop_nh_black_2009,
         pop_h_2009,
         pop_nh_nhpi_2009,
         pop_nh_other_2009,
         pop_nh_multi_2009,
         pop_nh_white_2009,
         per_18t34_2009,
         per_35t64_2009,
         per_65over_2009,
         per_aian_2009,
         per_asian_2009,
         per_black_2009,
         per_latino_2009,
         per_other_2009,
         per_white_2009,
         median_age_2009,
         per_child_2009,
         per_fb_2009,
         per_nomove_2009,
         vr_2009,
         hownrt_2009,
         per_bm_2009,
         hhld_pov_rt_2009,
         unemr_2009,
         median_gross_rent_2009,
         median_prop_value_2009,
         median_hhld_inc_2009,
         median_yr_str_2009,
         gini_2009,
         entropy_er_2009,
         entropy_ers_2009,
         pop_total_2022,
         pop_density_2022,
         pop_nh_aian_2022,
         pop_nh_asian_2022,
         pop_nh_black_2022,
         pop_h_2022,
         pop_nh_nhpi_2022,
         pop_nh_other_2022,
         pop_nh_multi_2022,
         pop_nh_white_2022,
         per_18t34_2022,
         per_35t64_2022,
         per_65over_2022,
         per_aian_2022,
         per_asian_2022,
         per_black_2022,
         per_latino_2022,
         per_other_2022,
         per_white_2022,
         median_age_2022,
         per_child_2022,
         per_fb_2022,
         per_nomove_2022,
         vr_2022,
         hownrt_2022,
         per_bm_2022,
         hhld_pov_rt_2022,
         unemr_2022,
         median_gross_rent_2022,
         median_prop_value_2022,
         median_yr_str_2022,
         median_hhld_inc_2022,
         gini_2022,
         entropy_er_2022,
         entropy_ers_2022,
         eli_ushare_2009,
         eli_ushare_2022,
         vli_ushare_2009,
         vli_ushare_2022,
         li_ushare_2009,
         li_ushare_2022,
         eli_ushare_lb_2009,
         eli_ushare_ub_2009,
         power_eli_2009,
         k_eli_2009,
         eli_ushare_2009,
         vli_ushare_lb_2009,
         vli_ushare_ub_2009,
         power_vli_2009,
         k_vli_2009,
         vli_ushare_2009,
         li_ushare_lb_2009,
         li_ushare_ub_2009,
         power_li_2009,
         k_li_2009,
         li_ushare_2009,
         eli_ushare_lb_2022,
         eli_ushare_ub_2022,
         power_eli_2022,
         k_eli_2022,
         eli_ushare_2022,
         vli_ushare_lb_2022,
         vli_ushare_ub_2022,
         power_vli_2022,
         k_vli_2022,
         vli_ushare_2022,
         li_ushare_lb_2022,
         li_ushare_ub_2022,
         power_li_2022,
         k_li_2022,
         li_ushare_2022)

## check the income shares ##

inc.shares.check <- msa.0922 %>%
  select(CBSA,
         eli_ushare_lb_2009,
         eli_ushare_ub_2009,
         power_eli_2009,
         k_eli_2009,
         eli_ushare_2009,
         vli_ushare_lb_2009,
         vli_ushare_ub_2009,
         power_vli_2009,
         k_vli_2009,
         vli_ushare_2009,
         li_ushare_lb_2009,
         li_ushare_ub_2009,
         power_li_2009,
         k_li_2009,
         li_ushare_2009,
         eli_ushare_lb_2022,
         eli_ushare_ub_2022,
         power_eli_2022,
         k_eli_2022,
         eli_ushare_2022,
         vli_ushare_lb_2022,
         vli_ushare_ub_2022,
         power_vli_2022,
         k_vli_2022,
         vli_ushare_2022,
         li_ushare_lb_2022,
         li_ushare_ub_2022,
         power_li_2022,
         k_li_2022,
         li_ushare_2022)

sum(msa.0922$eli_ushare_2009 >= msa.0922$eli_ushare_lb_2009 & 
    msa.0922$eli_ushare_2009 <= msa.0922$eli_ushare_ub_2009) == nrow(msa.0922)

sum(msa.0922$vli_ushare_2009 >= msa.0922$vli_ushare_lb_2009 & 
    msa.0922$vli_ushare_2009 <= msa.0922$vli_ushare_ub_2009) == nrow(msa.0922)

sum(msa.0922$li_ushare_2009 >= msa.0922$li_ushare_lb_2009 & 
    msa.0922$li_ushare_2009 <= msa.0922$li_ushare_ub_2009) == nrow(msa.0922)

sum(msa.0922$eli_ushare_2022 >= msa.0922$eli_ushare_lb_2022 & 
    msa.0922$eli_ushare_2022 <= msa.0922$eli_ushare_ub_2022) == nrow(msa.0922)

sum(msa.0922$vli_ushare_2022 >= msa.0922$vli_ushare_lb_2022 & 
    msa.0922$vli_ushare_2022 <= msa.0922$vli_ushare_ub_2022) == nrow(msa.0922)

sum(msa.0922$li_ushare_2022 >= msa.0922$li_ushare_lb_2022 & 
    msa.0922$li_ushare_2022 <= msa.0922$li_ushare_ub_2022) == nrow(msa.0922)

msa.0922$li_ushare_2022[msa.0922$CBSA == "41900"] <- 0.9909308

## try again ## 

sum(msa.0922$li_ushare_2022 >= msa.0922$li_ushare_lb_2022 - 0.001 & 
    msa.0922$li_ushare_2022 <= msa.0922$li_ushare_ub_2022 + 0.001) == nrow(msa.0922)


## keep only necessary variables ##

msa.0922.fin <- msa.0922 %>%
  select(-eli_ushare_lb_2009,
         -eli_ushare_ub_2009,
         -power_eli_2009,
         -k_eli_2009,
         -vli_ushare_lb_2009,
         -vli_ushare_ub_2009,
         -power_vli_2009,
         -k_vli_2009,
         -li_ushare_lb_2009,
         -li_ushare_ub_2009,
         -power_li_2009,
         -k_li_2009,
         -eli_ushare_lb_2022,
         -eli_ushare_ub_2022,
         -power_eli_2022,
         -k_eli_2022,
         -vli_ushare_lb_2022,
         -vli_ushare_ub_2022,
         -power_vli_2022,
         -k_vli_2022,
         -li_ushare_lb_2022,
         -li_ushare_ub_2022,
         -power_li_2022,
         -k_li_2022)

## checks ## 

summary(msa.0922.fin)

ct <- msa.0922.fin %>%
  filter(CBSA  %in% c("14860","25540","35300","35980"))

summary(ct)


## save final msa file ##

save(msa.0922.fin,
     file = paste0(output_path,
                   "003_msa_0922.Rda"))


## END OF PROGRAM ##

#sink()